---
title: "Cleaning DHS data"
author: "Anne Christine Bischops"
date: "18 Januar 2018"
output: html_document
---
---

Last updated: 2017-02-20


Load libraries.
```{r load packages}
library(tidyverse) 
library(haven) # part of the tidyverse for reading in stata; should not need to load it in addition to tidyverse
library(dplyr) 
library(forcats) # for categorical variables (R for data science rec) --> see https://rdrr.io/cran/forcats/man/fct_unify.html
library(stringr) # for manipulating string variables (R for data science rec)
library(data.table) # this has some useful features like rbindlist
#library(lubridate) # for dates and times (R for data science rec)
#instalibrary(dummies) # to easily create dummies
library(ggplot2) 
library(ggrepel) # to avoid text labels in ggplot from overlapping
library(modelr) # to use "add_predictions()" for adding a column of predicted vals to your dataset
library(broom) # to create tidy data from model output
#library(margins) # R equivalent of Stata's margins command --> Thomas Leeper said this only to be used for marginal effects (not prediction)
#library(prediction) # Thomas Leeper's R package to get predicted probabilities
library(srvyr)  # survey package that also works with dplyr 
library(lmtest) # for likelihood ratio tests
#library(sandwich) # for robust standard errors 
#library(multiwayvcov) # for clustered standard errors
library(miceadds) # package to cluster SEs more easily than in multiwayvcov; it uses multiwayvcov, so the results between the two packages are exactly the same. 
#library(glmmML)  # Allows for fast computation of logits and poissons with large number of fixed effects
library(speedglm)
#library(data.table) # this has some useful features like rbindlist
#library(lme4) # for multi-level modeling
#library(lmerTest) # for p-values with the lmer command
#library(sjPlot) # for plotting lmer models
#library(texreg) # for tables
library(tableone) # Creates a table 1 (summary characteristics)
#library(mice) # md.pattern() function to see patterns of missing data 
library(reshape) # to use the rescalar function

#library(car) # for easy attaching of new variables
#library(arm)
#library(mosaic)
#library(mosaicData)
#library(mediation)  # for mediation analysis
#library(lattice)
#library(pander)

```



In India, all men aged 15-54 and women aged 15-49 were eligible for the blood glucose and blood pressure measurement according to the questionnaire(%missing must be calculated).
No district id, but cluster id.
weights/1000000

Fasting status: Blood glucose was measured randomly, but participants were asked the time when they last ate and drank.
Smoking is asked, but only categorical questions of different types of cigarettes/cigars etc. Alcohol consumption at last sex is only questioned and consumption of husband, so I left this out. Fruits are only asked in the womens questionnaire in combination with motherhood, so I left this out. Vigorous activity at work was not asked.

601,509 households with completed interviews (628,900 sampled)
699,686 women with completed interviews (723,875 sampled)
112,122 men with completed interviews (122,051 sampled)
Weighted by state and urban/rural, and within major cities by slum/non-slum 
The Biomarker Schedule covered measurements of height, weight and haemoglobin levels for children; measurements of height, weight, haemoglobin levels, blood pressure, and random blood glucose level for women aged 15-49 years and men aged 15-54 years


```{r India 2015-2016, eval=TRUE}

#### WOMen
india.women <- read_spss("IAIR71FL.SAV")

#test.women <-head(india.women,n=20)


india.women <- india.women %>% 
  mutate( Country = "India", 
          year = "2015-2016", 
          svy = "DHS",
                        psu = V021,
                        d_id=str_c(as.character(V024), as.character(SDISTRI), sep="_"),
                        ex_state_ind=V024,
                        c_id=V001,
                        stratum = V023, 
                        hh_id = str_c(as.character(V021), as.character(V002), sep="_"),
                        p_id=CASEID,
                        p_wt = V005/1000000, 
                        sex = 1,
                        dob=str_c(as.character(V009),as.character(V010),sep="/"),
                        age_sr =  V012,
                        age= V012,
                        age_5yr= V013,
                        age_10yr= ifelse(V012>=15& V012<=24,1,
                                         ifelse(V012>=25&V012<=34,2,
                                                ifelse(V012>=35 &V012<=44,3,
                                                      ifelse( V012>=45 & V012 <=54,4,
                                                              ifelse(V012>=55 &V012<=64,5,
                                                                     ifelse(V012>=65,6,NA)))))),
                        edyears = V133,
                        educat = ifelse(V149==0, 0,
                                    ifelse(V149==1, 1, 
                                        ifelse(V149==2, 2,
                                             ifelse(V149==3, 3, 
                                                  ifelse(V149==4 | V149==5, 4, NA))))),
                        #educat_lcl:Educational attainment: 0No education,1Incomplete primary,2Complete primary,3Incomplete secondary,4Complete secondary,5Higher,9Missing 
                        educat_lcl = ifelse(V149==0, 1,
                                    ifelse(V149==1, 2, 
                                        ifelse(V149==2, 3,
                                             ifelse(V149==3, 4, 
                                                  ifelse(V149==4,5,
                                                         ifelse(V149==5, 6,
                                                                ifelse(V149==9,888888888,NA))))))),
                        race=V131,
                        marital=ifelse(V501==0,1,
                                       ifelse(V501==1,2,
                                              ifelse(V501==5,3,
                                                     ifelse(V501==4,4,
                                                            ifelse(V501==3,5,
                                                                   ifelse(V501==2,6,
                                                                          ifelse(V501==9,888888888,NA))))))),
                        working=V714,
                        total_hh=V136,
                        asset_index = V191,
                        wealth_quintile = V190,
                        wealth_quintile_r=S190R,
                        wealth_quintile_urb=S190U,#should be one variable
                        csmoke=ifelse(V463Z==0,1,
                                      ifelse(V463Z==1,0,
                                      ifelse(V463Z==9,NA,NA))),
                        othersmkls=ifelse(V463D==1,"snuff use",
                                          ifelse(V463C==1,"chewing tobacco use",
                                                 ifelse(V463F==1,"Gutkha/Paan Masala with tobacco",
                                                        ifelse(V463G==1,"Paan with tobacco",
                                                              ifelse(S710E==1,"Khaini",NA))))),
                        csmkls_tb=ifelse(V463C==1 |V463D==1 |V463F==1|V463G==1|S710E==1,1,
                                          ifelse(V463C==0& V463D==0&V463F==0&V463G==0&S710E==0,0,NA)),
                        try_qsmk=S712,
                        advised_qsm=S714,
                        home_smk=S715,
                        ever_alc=S716,#question: do you drink alcohol?
                        alc_freq=ifelse(S717==1,1,
                                        ifelse(S717==2,4,
                                               ifelse(S717==3,5,NA))),#actually 1= almost once a week,2 once a week, 3 less than once a week changed to 1=daily,4=1-2 days a week ,5=1-3 per month-->correct??
                        bp_ms=SB17,
                        hypt = ifelse(SB18==1, 1, 
                                      ifelse(SB18==0, 0, NA)), # ever told hypertension
                        hypt_med = ifelse(SB19==1, 1, 
                                      ifelse(SB19==0, 0, NA)),#same as bp_med
                        sbp1 = SB16S, 
                        sbp2 = SB23S, 
                        sbp3 = SB27S, 
                        dbp1 = SB16D,  
                        dbp2 = SB23D,
                        dbp3 =SB27D,
                        sbp_nmeasures=ifelse(is.na(sbp1)==FALSE &is.na(sbp2)==FALSE & is.na(sbp3)==FALSE,3,
                                            ifelse(is.na(sbp1)==FALSE &is.na(sbp2)==TRUE & is.na(sbp3)==TRUE,1,
                                            ifelse(is.na(sbp1)==FALSE&is.na(sbp2)==FALSE & is.na(sbp3) ==TRUE,2,0))),
                        dbp_nmeasures=ifelse(is.na(dbp1)==FALSE & is.na(dbp2)==FALSE & is.na(dbp3)==FALSE,3,ifelse(is.na(dbp1)==FALSE &is.na(dbp2)==TRUE & is.na(dbp3)==TRUE,1,
ifelse(is.na(dbp1)==FALSE&is.na(dbp2)==FALSE & is.na(dbp3) ==TRUE,2,0))),
                        hbg12=S723A,#actually "currently has diabetes",
                        ex_dia_med_ind=S723AB,#"has sought treatment for diabetes",
                        fast=ifelse(SB51>=12 & SB52 >= 12,1,0),
                        tbg=str_c(as.character(SB69H), as.character(SB69M), sep=":"),
                        fbg=SB70*0.0555,# also includes not fasted, to convert into mmol/l,
                        ex_glucose_ind=SB70,
                        cervical=S727A,# had cervix examination
                        pregnant = ifelse(V213 !=1 | is.na(V213)==T, 0, 1),
                        ht=V438,
                        wt=V437,
                        bmi=(V437*0.1)/(V438*0.001)^2 ,
                        bmicat=ifelse( bmi<18.5,1,
                                       ifelse(bmi>=18.5& bmi<25,2,
                                              ifelse(bmi>=25 & bmi<30,3,
                                                     ifelse(bmi>30,4,NA)))),
                        overweight=ifelse(bmi>=25,1,
                                          ifelse(bmi<25,0,NA)),
                        obese=ifelse(bmi>=30,1,
                                       ifelse(bmi<30,0,NA)),
                        ex_hb_ind=V453,
                        ex_hb_adj_ind=V456,
                        ex_anemia_ind=V457,
                        ex_bpprior_eaten_ind=SB12A,
                        ex_bpprior_caffeine_ind=SB12B,
                        ex_bpprior_smoked_ind=SB12C,
                        ex_bpprior_othertobacco_ind=SB12D,
                      
                        # vars not in codebook: 
                        mergeid = str_c(as.character(V001), as.character(V002), as.character(V003), sep="_"),
                        urban = ifelse(V025==1, 1, 
                                       ifelse(V025==2, 0, NA)),
                        visitor = ifelse(V135==2, 1, 0),
                        ineligible = ifelse(is.na(SCONSENT)==T, 1, 0)) %>% 
  dplyr::select(Country, year, svy, psu, stratum,d_id,ex_state_ind,c_id, hh_id,p_id, p_wt, sex,dob, age,age_sr,age_5yr,age_10yr, edyears, educat,educat_lcl,race,marital,working,total_hh, asset_index, wealth_quintile,wealth_quintile_r,wealth_quintile_urb,csmoke,try_qsmk,advised_qsm,othersmkls,csmkls_tb,home_smk,ever_alc,alc_freq,bp_ms, 
                hypt, hypt_med, sbp1, sbp2, sbp3, dbp1, dbp2, dbp3,sbp_nmeasures,dbp_nmeasures,ex_dia_med_ind,hbg12,fast,tbg,fbg,cervical,pregnant,ht,wt,bmi,bmicat,overweight,obese,ex_hb_ind,ex_hb_adj_ind,ex_anemia_ind, mergeid, urban, visitor, ineligible,ex_hb_ind,ex_hb_adj_ind,ex_anemia_ind,ex_bpprior_eaten_ind,ex_bpprior_caffeine_ind,ex_bpprior_smoked_ind,ex_bpprior_othertobacco_ind,ex_glucose_ind)

```

```{r}

##### MEN                               
india.men <- read_spss("IAMR71FL.SAV")

test.men <-head(india.men,n=20)

india.men <- india.men %>% 
  mutate( 
                        Country = "India", 
                        year = "2015-2016", 
                        svy = "DHS",
                        psu = MV021, 
                        stratum = MV023,
                        d_id = str_c(as.character(MV024), as.character(SMDISTRI), sep="_"),
                        ex_state_ind= MV024,
                        c_id= MV001,
                        hh_id = str_c(as.character(MV021), as.character(MV002), sep="_"),
                        p_id= MCASEID,
                        p_wt = MV005/1000000, 
                        sex = 0,
                        dob=str_c(as.character(MV009),as.character(MV010),sep="/"),
                        age_sr =  MV012,
                        age= MV012,
                        age_5yr= MV013,
                        age_10yr= ifelse(MV012>=15& MV012<=24,1,
                                         ifelse(MV012>=25&MV012<=34,2,
                                                ifelse(MV012>=35 &MV012<=44,3,
                                                      ifelse( MV012>=45 & MV012 <=54,4,
                                                              ifelse(MV012>=55 &MV012<=64,5,ifelse(MV012>=65,6,NA)))))),
                        edyears = MV133,
                        educat = ifelse(MV149==0, 0,
                                    ifelse(MV149==1, 1, 
                                        ifelse(MV149==2, 2,
                                             ifelse(MV149==3, 3, 
                                                  ifelse(MV149==4 | MV149==5, 4, NA))))),
                         #educat_lcl:Educational attainment: 0No education,1Incomplete primary,2Complete primary,3Incomplete secondary,4Complete secondary,5Higher,9Missing 
                        educat_lcl = ifelse(MV149==0, 1,
                                    ifelse(MV149==1, 2, 
                                        ifelse(MV149==2, 3,
                                             ifelse(MV149==3, 4, 
                                                  ifelse(MV149==4,5,
                                                         ifelse(MV149==5, 6,
                                                                ifelse(MV149==9,888888888,NA))))))),
                        race=MV131,
                        marital=ifelse(MV501==0,1,
                                       ifelse(MV501==1,2,
                                              ifelse(MV501==5,3,
                                                     ifelse(MV501==4,4,
                                                            ifelse(MV501==3,5,
                                                                   ifelse(MV501==2,6,
                                                                          ifelse(MV501==9,888888888,NA))))))),
                        working=MV714,
                        total_hh=MV136,
                        asset_index = MV191,
                        wealth_quintile = MV190,
                        wealth_quintile_r=SM190R,
                        wealth_quintile_urb=SM190U, #should be one variable
                        csmoke=ifelse(MV463Z==0,1,
                                      ifelse(MV463Z==1,0,
                                      ifelse(MV463Z==9,NA,NA))),
                        othersmkls=ifelse(MV463D==1,"snuff use",
                                          ifelse(MV463C==1,"chewing tobacco use",
                                                 ifelse(MV463F==1,"Gutkha/Paan Masala with tobacco",
                                                        ifelse(MV463G==1,"Paan with tobacco",
                                                              ifelse(SM609E==1,"Khaini",NA))))),
                        csmkls_tb=ifelse(MV463C==1 |MV463D==1 |MV463F==1|MV463G==1|SM609E==1,1,
                                          ifelse(MV463C==0& MV463D==0&MV463F==0&MV463G==0&SM609E==0,0,NA)),
                        try_qsmk=SM611,
                        advised_qsm=SM613,
                        home_smk=SM614,
                        ever_alc=SM615,#question: do you drink alcohol? or rather ever_alc?
                        alc_freq=ifelse(SM616==1,1,
                                        ifelse(SM616==2,4,
                                               ifelse(SM616==3,5,NA))),#actually 1= almost once a week,2 once a week, 3 less than once a week changed to 1=daily,4=1-2 days a week ,5=1-3 per month-->correct??
                        bp_ms=SMB17,
                        hypt = ifelse(SMB18==1, 1, 
                                      ifelse(SMB18==0, 0, NA)), # ever told hypertension on two or more occasions by doctor or other health professional
                        hypt_med = ifelse(SMB19==1, 1, 
                                      ifelse(SMB19==0, 0, NA)),#same as bp_med
                        sbp1 = SMB16S, 
                        sbp2 =SMB23S, 
                        sbp3 =SMB27S, 
                        dbp1 = SMB16D,  
                        dbp2 = SMB23D,
                        dbp3 =SMB27D,
                        sbp_nmeasures=ifelse(is.na(sbp1)==FALSE &is.na(sbp2)==FALSE & is.na(sbp3)==FALSE,3,
                                            ifelse(is.na(sbp1)==FALSE &is.na(sbp2)==TRUE & is.na(sbp3)==TRUE,1,
                                            ifelse(is.na(sbp1)==FALSE&is.na(sbp2)==FALSE & is.na(sbp3) ==TRUE,2,0))),
                        dbp_nmeasures=ifelse(is.na(dbp1)==FALSE & is.na(dbp2)==FALSE & is.na(dbp3)==FALSE,3,ifelse(is.na(dbp1)==FALSE &is.na(dbp2)==TRUE & is.na(dbp3)==TRUE,1,
ifelse(is.na(dbp1)==FALSE&is.na(dbp2)==FALSE & is.na(dbp3) ==TRUE,2,0))),
                        hbg12=SM622A,#actually "currently has diabetes",
                        ex_dia_med_ind=SM622AB,#"has sought treatment for diabetes",
                        fast=ifelse(SMB51>=12 & SMB52 >= 12,1,0),
                        tbg=str_c(as.character(SMB69H), as.character(SMB69M), sep=":"),
                        fbg=(SMB70*0.0555),# also includes not fasted, to convert into mmol/l,
                        ex_glucose_ind=SMB70,
                        cervical=0,# had cervix examination
                        pregnant= 0,
                        ex_bpprior_eaten_ind=SMB12A,
                        ex_bpprior_caffeine_ind=SMB12B,
                        ex_bpprior_smoked_ind=SMB12C,
                        ex_bpprior_othertobacco_ind=SMB12D,
                        #mens height, weight and hemoglobin is in hh survey
                        
                        # vars not in codebook: #
                        mergeid = str_c(as.character(MV001), as.character(MV002), as.character(MV003), sep="_"),
                        urban = ifelse(MV025==1, 1, 
                                       ifelse(MV025==2, 0, NA)),
                        visitor = ifelse(MV135==2, 1, 0),
                        ineligible = ifelse(is.na(SMCONSENT)==T, 1, 0)) %>% 
  dplyr::select(Country, year, svy, psu, stratum,d_id,ex_state_ind,c_id, hh_id,p_id, p_wt, sex,dob, age,age_sr,age_5yr,age_10yr, edyears, educat,educat_lcl,race,marital,working,total_hh, asset_index, wealth_quintile,wealth_quintile_r,wealth_quintile_urb,csmoke,try_qsmk,advised_qsm,othersmkls,csmkls_tb,home_smk,ever_alc,alc_freq,bp_ms, 
                hypt, hypt_med, sbp1, sbp2, sbp3, dbp1, dbp2, dbp3,sbp_nmeasures,dbp_nmeasures,ex_dia_med_ind,hbg12,fast,tbg,fbg,cervical,pregnant, mergeid, urban, visitor,ineligible,ex_bpprior_eaten_ind,ex_bpprior_caffeine_ind,ex_bpprior_smoked_ind,ex_bpprior_othertobacco_ind,ex_glucose_ind)


# household member recode
india.hh <- read_spss("IAPR71FL.SAV")

#test.hh <-head(india.hh,n=20)

india.hh <- india.hh %>% 
  mutate( 
                     mergeid = str_c(as.character(HV001), as.character(HV002), as.character(HVIDX), sep="_"),
                     ht=HB3,
                     hh_wt=HV005,
                     wt=HB2,
                     doi=str_c(as.character(HV006),as.character(HV007),sep="/"), #date of interview, not in codebook, for age_c
                    bmi=(HB2*0.1)/(HB3*0.001)^2 ,
                    bmicat=ifelse( bmi<18.5,1,
                                       ifelse(bmi>=18.5& bmi<25,2,
                                              ifelse(bmi>=25 &bmi<30,3,
                                                     ifelse(bmi>30,4,NA)))),
                        overweight=ifelse(bmi>=25,1,
                                          ifelse(bmi<25,0,NA)),
                        obese=ifelse(bmi>=30,1,
                                       ifelse(bmi<30,0,NA)),
                        ex_hb_ind=HB53,
                        ex_hb_adj_ind=HB56,
                        ex_anemia_ind=HB57 ) %>% 
  dplyr::select(mergeid,hh_wt,ht,wt,bmi,bmicat,overweight,obese,ex_hb_ind,ex_hb_adj_ind,ex_anemia_ind,doi)

# Append and merge
#india.hh= 2869043 obs of 13 variables
#india.women=699686 obs of 64 variables
#india.men=112122 obs of 55 variables
#without filter 811808 observations,with filter 757655
india.ind <- bind_rows(india.women, india.men)
india <- left_join(india.ind, india.hh, by=c("mergeid"="mergeid"))

# when merging, hh_wt and doi get lost if joining also by ht/wt etc., so instead I only join by mergeid and join the other variables later on:
india$ht <-ifelse(is.na(india$ht.y)==TRUE,india$ht.x,india$ht.y)
india$wt <-ifelse(is.na(india$wt.y)==TRUE,india$wt.x,india$wt.y)
india$bmi <-ifelse(is.na(india$bmi.y)==TRUE,india$bmi.x,india$bmi.y)
india$bmicat <-ifelse(is.na(india$bmicat.y)==TRUE,india$bmicat.x,india$bmicat.y)
india$overweight <-ifelse(is.na(india$overweight.y)==TRUE,india$overweight.x,india$overweight.y)
india$obese <-ifelse(is.na(india$obese.y)==TRUE,india$obese.x,india$obese.y)
india$ex_hb_ind <-ifelse(is.na(india$ex_hb_ind.y)==TRUE,india$ex_hb_ind.x,india$ex_hb_ind.y)
india$ex_hb_adj_ind <-ifelse(is.na(india$ex_hb_adj_ind.y)==TRUE,india$ex_hb_adj_ind.x,india$ex_hb_adj_ind.y)
india$ex_anemia_ind <-ifelse(is.na(india$ex_anemia_ind.y)==TRUE,india$ex_anemia_ind.x,india$ex_anemia_ind.y)


india <-india %>%
 dplyr::select(-ht.x,-ht.y,-wt.x,-wt.y,-bmi.x,-bmi.y,-bmicat.x,-bmicat.y,-overweight.x,-overweight.y,-obese.x,-obese.y,-ex_hb_ind.x,-ex_hb_ind.y,-ex_hb_adj_ind.x,-ex_hb_adj_ind.y,-ex_anemia_ind.x,-ex_anemia_ind.y)
india <- filter(india, pregnant==0 & ((sex==0 & visitor==0) | (sex==1 & visitor==0 & ineligible==0)))

```


```{r Clean aggregate dataset}
 

# Convert country to factor
india <- india %>% 
  mutate(
    Country = as.factor(Country))

# ***** Clean implausible values ***** 
india <- india %>% 
  mutate(
    edyears = ifelse(edyears>=97, NA, edyears),
    tbg=ifelse(tbg>=96:96,NA,tbg),
    fbg=ifelse(fbg>=995,NA,fbg),
    ht=ifelse(ht>=9995,NA,ht),
    wt=ifelse(wt>=9994,NA,wt),
    sbp1 = ifelse(sbp1>240 | sbp1<70, NA, sbp1),
    sbp2 = ifelse(sbp2>240 | sbp2<70, NA, sbp2),
    sbp3 = ifelse(sbp3>240 | sbp3<70, NA, sbp3),
    dbp1 = ifelse(dbp1>130 | dbp1<40, NA, dbp1),
    dbp2 = ifelse(dbp2>130 | dbp2<40, NA, dbp2),
    dbp3 = ifelse(dbp3>130 | dbp3<40, NA, dbp3))

    

# ***** Create hypertension variables ***** 
india <- india %>% 
  mutate(
    sbp_avg = 
      ifelse(is.na(sbp1)==F & is.na(sbp2)==F & is.na(sbp3)==F, (sbp1+sbp2+sbp3)/3,
        ifelse(is.na(sbp1)==T & is.na(sbp2)==F & is.na(sbp3)==F, (sbp2+sbp3)/2,
           ifelse(is.na(sbp1)==F & is.na(sbp2)==T & is.na(sbp3)==F, (sbp1+sbp3)/2,
              ifelse(is.na(sbp1)==F & is.na(sbp2)==F & is.na(sbp3)==T, (sbp1+sbp2)/2,
                  ifelse(is.na(sbp1)==F & is.na(sbp2)==T & is.na(sbp3)==T, sbp1,
                      ifelse(is.na(sbp1)==T & is.na(sbp2)==F & is.na(sbp3)==T, sbp2,
                          ifelse(is.na(sbp1)==T & is.na(sbp2)==T & is.na(sbp3)==F, sbp3, NA
        ))))))),
    dbp_avg = 
      ifelse(is.na(dbp1)==F & is.na(dbp2)==F & is.na(dbp3)==F, (dbp1+dbp2+dbp3)/3,
        ifelse(is.na(dbp1)==T & is.na(dbp2)==F & is.na(dbp3)==F, (dbp2+dbp3)/2,
           ifelse(is.na(dbp1)==F & is.na(dbp2)==T & is.na(dbp3)==F, (dbp1+dbp3)/2,
              ifelse(is.na(dbp1)==F & is.na(dbp2)==F & is.na(dbp3)==T, (dbp1+dbp2)/2,
                  ifelse(is.na(dbp1)==F & is.na(dbp2)==T & is.na(dbp3)==T, dbp1,
                      ifelse(is.na(dbp1)==T & is.na(dbp2)==F & is.na(dbp3)==T, dbp2,
                          ifelse(is.na(dbp1)==T & is.na(dbp2)==T & is.na(dbp3)==F, dbp3, NA
        ))))))))


# *****Create new bmi and clean BMI and hemoglobine *****

india <- india %>%
  mutate(bmi=(wt*0.1)/(ht*0.001)^2)
india <- india %>% 
  mutate( bmi = ifelse(bmi<10 | bmi>80, NA, bmi),
          bmicat=ifelse( bmi<18.5,1,
                                       ifelse(bmi>=18.5& bmi<25,2,
                                              ifelse(bmi>=25 &bmi<30,3,
                                                     ifelse(bmi>30,4,NA)))),
          overweight=ifelse(bmi>=25,1,
                                          ifelse(bmi<25,0,NA)),
                        obese=ifelse(bmi>=30,1,
                                       ifelse(bmi<30,0,NA)))

india$ex_hb_ind <-india$ex_hb_ind*0.1
india <-india %>%
  mutate(ex_hb_ind=ifelse(ex_hb_ind <1 |ex_hb_ind >25,NA,ex_hb_ind))

india$ex_hb_adj_ind <-india$ex_hb_adj_ind*0.1
india <-india %>%
  mutate(ex_hb_adj_ind=ifelse(ex_hb_adj_ind <1 |ex_hb_adj_ind >25,NA,ex_hb_ind))


#***** Create diab_broad/narrow and htn_broad/narrow variables, corrected 

india <- mutate(india,
              ex_diab_narrow_ind = ifelse( is.na(ex_glucose_ind)==T, NA,
                                           ifelse(ex_glucose_ind>=200, 1,
                                                  ifelse(fast==1 & ex_glucose_ind>=126, 1, 0))))

india <-mutate(india,
             ex_diab_broad_ind = ifelse(is.na(ex_diab_narrow_ind)==T | is.na(hbg12) ==T, NA, ifelse(hbg12==1 | ex_diab_narrow_ind==1, 1, 0)))

# hypertension-definition

india <- mutate(india,ex_htn_narrow_ind=ifelse(is.na(sbp_avg)==T | is.na(dbp_avg)==T, NA, ifelse(sbp_avg>=140 | dbp_avg >=90, 1, 0)))


india<-mutate(india,ex_htn_broad_ind = ifelse(is.na(ex_htn_narrow_ind)==T | is.na(hypt) ==T | is.na(hypt_med)==T, NA,ifelse(hypt_med==1 | hypt==1 | ex_htn_narrow_ind==1, 1, 0))
  

#diab_narrow is defined as fasted blood glucose>126 and non fasted blood glucose>200, diab_broad is either having a high blood glucose(diab_narrow), self-reported diabetes("currently having diabetes,hbg12") or positive answer to "has sought treatment for diabetes")
#htn_narrow is defined as average systolic bp >140 or diastolic bp >90, htn_broad additionally includes self-reported hypertension("told had high bp by doctor or other health professional on two or more occasions") and positive answer to "currently taking a prescribed medicine to lower BP"
  


#**** Create age_c with calculated age from dob and doi********

#calculated in excel

# *** Create wealth_quintile_rurb from _r and _urb columns***
  
india$wealth_quintile_rurb <-ifelse(is.na(india$wealth_quintile_urb),india$wealth_quintile_r,india$wealth_quintile_urb)

india <-india %>%
 dplyr::select(-wealth_quintile_r,-wealth_quintile_urb)


# This step is necessary because the DHS states for some reason came without labels(MAP-code)

india <- mutate(india,
                    ex_state_ind = ifelse(ex_state_ind==1, "Andaman and Nicobar Islands",
                                ifelse(ex_state_ind==2, "Andhra Pradesh",
                                   ifelse(ex_state_ind==3, "Arunachal Pradesh",
                                          ifelse(ex_state_ind==4, "Assam",
                                                 ifelse(ex_state_ind==5,"Bihar",
                                                 ifelse(ex_state_ind==6, "Chandigarh",
                                                        ifelse(ex_state_ind==7, "Chhattisgarh",
                                                               ifelse(ex_state_ind==8,"Dadra and Nagar Haveli",
                                                                      ifelse(ex_state_ind==9,"Daman and Diu",
                                                                             ifelse(ex_state_ind==10,"Goa",
                                                               ifelse(ex_state_ind==11, "Gujarat",
                                                                      ifelse(ex_state_ind==12, "Haryana",
                                                                             ifelse(ex_state_ind==13, "Himachal Pradesh",
                                                                                    ifelse(ex_state_ind==14, "Jammu and Kashmir",
                                                                                           ifelse(ex_state_ind==15, "Jharkhand",
                                                                                                  ifelse(ex_state_ind==16, "Karnataka",
                                                                                                         ifelse(ex_state_ind==17, "Kerala",
                                                                                                                ifelse(ex_state_ind==18,"Lakshadweep",
                                                                                                                ifelse(ex_state_ind==19, "Madhya Pradesh",
                                                                                                                       ifelse(ex_state_ind==20,"Maharashtra",
                                                                                                                              ifelse(ex_state_ind==21,"Manipur",
                                                                                                                                     ifelse(ex_state_ind==22,"Meghalaya",
                                                                                                                                            ifelse(ex_state_ind==23,"Mizoram",
                                                                                                                                                   ifelse(ex_state_ind==24,"Nagaland",
                                                                                                                       ifelse(ex_state_ind==25, "Delhi",
                                                                                                                              ifelse(ex_state_ind==26,"Odisha",
                                                                                                                              ifelse(ex_state_ind==27, "Puducherry",
                                                                                                                                     ifelse(ex_state_ind==28, "Punjab",
                                                                                                                                            ifelse(ex_state_ind==29, "Rajasthan",
                                                                                                                                                   ifelse(ex_state_ind==30, "Sikkim",
                                                                                                                                                          ifelse(ex_state_ind==31,"Tamil Nadu",
                                                                                                                                                          ifelse(ex_state_ind==32, "Tripura",
                                                                                                                                                                 ifelse(ex_state_ind==33, "Uttar Pradesh",
                                                                                                                                                                        ifelse(ex_state_ind==34, "Uttarakhand",
                                                                                                                                                                               ifelse(ex_state_ind==35, "West Bengal",
                                                                                                                                                                                      ifelse(ex_state_ind==36, "Telangana",NA )))))))))))))))))))))))))))))))))))))

                                          
india$ex_state_ind <- as.factor(india$ex_state_ind)

# add ex_district_ind with district names from Lara's List
district_names <-read_csv("District_Names_DHS_India_15_16.csv")

district_names$ex_d_name_ind <-district_names$d_name

district_names <-district_names %>%
 dplyr::select(-X3,-X4,-X5,-d_name)


india <- left_join(india,district_names, by=c("d_id"="d_id"))

#convert race
india$race <-ifelse(india$race==991,"caste",
               ifelse(india$race==992,"tribe",
                      ifelse(india$race==993,"no tribe/caste",
                             ifelse(india$race==998 |india$race==999,NA,NA))))

#check whether  biomarkers are reasonable( only measured in women <49 and men <54)

india %>%
  filter(sex==0 & is.na(sbp1)==F & age>=55) #0 observations
india %>%
  filter(sex==1 & is.na(sbp1)==F & age>=50) #0 observations
india %>%
  filter(sex==0 & is.na(fbg)==F & age>=55)#0 observations
india %>%
  filter(sex==1 & is.na(fbg)==F & age>=50) #0 observations


#change decimals
india$ht <-india$ht*0.1
india$wt <-india$wt*0.1
india$asset_index <-india$asset_index*0.00001
india$hh_wt <-india$hh_wt*0.000001

write_csv(india, "DHS India 2015-2016-cleaned with codebook.csv")

#final changes (doi,dob)

india<- read_csv("DHS India 2015-2016-cleaned with codebook.csv")

#remove doi and dob after having calculated age_c
india <- india %>%
 dplyr::select(-doi,-dob,-ineligible,-visitor)


write_csv(india, "DHS India 2015-2016-cleaned with codebook(20-02-2018.csv") 
```


